## 红楼梦文本挖掘之数据预处理####
## 主要用于文本文档的读取和构建
## 文本预处理
## 孙玉林;2016年10月31
## 加载所需要的包
library(jiebaR)
## Loading required package: jiebaRD
library(tm)
## Loading required package: NLP
library(readr)
library(stringr)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(GGally)
library(gridExtra)
library(scatterplot3d)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(fastcluster)
##
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
##
## hclust
library(topicmodels)
library(LDAvis)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:plotly':
##
## %>%, groups
## The following object is masked from 'package:stringr':
##
## %>%
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
## 读取所需要的文件####
# ## 读取红楼梦的词典
# filename <-"./数据/红楼梦诗词123.txt"
# dictionry <- read_csv(file = filename,col_names = FALSE)
# ## 对词典去重
# dictionry <- unique(dictionry$X1)
# write(dictionry,"./数据/红楼梦词典.txt",sep = "\n")
## 读取停用词
filename <- "./数据/我的红楼梦停用词.txt"
mystopwords <- readLines(filename)
## 读取红楼梦
filename <-"./数据/红楼梦UTF82.txt"
Red_dream <- readLines(filename,encoding='UTF-8')
## Warning in readLines(filename, encoding = "UTF-8"): 读'./数据/红楼梦
## UTF82.txt'时最后一行未遂
## 将读入的文档分章节####
#去除空白行
Red_dream <- Red_dream[!is.na(Red_dream)]
# Red_dream <- as.vector(Red_dream)
# Red_dream[is.na(Red_dream)]
## 删除卷数据
juan <- grep(Red_dream,pattern = "^第+.+卷")
Red_dream <- Red_dream[(-juan)]
## 找出每一章节的头部行数和尾部行数
## 每一章节的名字
Red_dreamname <- data.frame(name = Red_dream[grep(Red_dream,pattern = "^第+.+回")],
chapter = 1:120)
## 处理章节名
names <- data.frame(str_split(Red_dreamname$name,pattern = " ",simplify =TRUE))
Red_dreamname$chapter2 <- names$X1
Red_dreamname$Name <- apply(names[,2:3],1,str_c,collapse = ",")
## 每章的开始行数
Red_dreamname$chapbegin<- grep(Red_dream,pattern = "^第+.+回")
## 每章的结束行数
Red_dreamname$chapend <- c((Red_dreamname$chapbegin-1)[-1],length(Red_dream))
## 每章的段落长度
Red_dreamname$chaplen <- Red_dreamname$chapend - Red_dreamname$chapbegin
## 每章的内容
for (ii in 1:nrow(Red_dreamname)) {
## 将内容使用句号连接
chapstrs <- str_c(Red_dream[(Red_dreamname$chapbegin[ii]+1):Red_dreamname$chapend[ii]],collapse = "")
## 剔除不必要的空格
Red_dreamname$content[ii] <- str_replace_all(chapstrs,pattern = "[[:blank:]]",replacement = "")
}
## 每章节的内容
content <- Red_dreamname$content
Red_dreamname$content <- NULL
## 计算每章有多少个字
Red_dreamname$numchars <- nchar(content)
## 对红楼梦进行分词####
Red_fen <- jiebaR::worker(type = "mix",user = "./数据/红楼梦词典.txt")
Fen_red <- apply_list(as.list(content),Red_fen)
## 去除停用词,使用并行的方法
library(parallel)
cl <- makeCluster(4)
Fen_red <- parLapply(cl = cl,Fen_red, filter_segment,filter_words=mystopwords)
stopCluster(cl)
# Fen_red <- lapply(Fen_red, filter_segment,filter_words=mystopwords)
## 每章节最终有多少个词
Fen_red2 <- lapply(Fen_red, unique) #去重
Red_dreamname$wordlen <- unlist(lapply(Fen_red2,length))
## 添加分组变量,前80章为1组,后40章为2组
Red_dreamname$Group <- factor(rep(c(1,2),times = c(80,40)),
labels = c("前80章","后40章"))
## 词频统计##-----------------------------------------------------------
## 1:构建文档-词项频数矩阵
corpus <- Corpus(VectorSource(Fen_red))
Red_dtm <- DocumentTermMatrix(corpus,control = list(wordLengths=c(1,Inf)))
Red_dtm
## <<DocumentTermMatrix (documents: 120, terms: 40342)>>
## Non-/sparse entries: 117554/4723486
## Sparsity : 98%
## Maximal term length: 12
## Weighting : term frequency (tf)
## 一共有4万多个词
## 2:词频统计
word_freq <- sort(colSums(as.matrix(Red_dtm)),decreasing = TRUE)
word_freq <- data.frame(word = names(word_freq),freq=word_freq,row.names = NULL)
word_freq$word <- as.factor(word_freq$word)
head(word_freq)
## word freq
## 1 宝玉 3907
## 2 笑道 1955
## 3 贾母 1686
## 4 一个 1440
## 5 凤姐 1228
## 6 袭人 1152
## 绘制词频图
nn <- 250
sum(word_freq$freq>=nn)
## [1] 69
word_freq[word_freq$freq >= nn,] %>%
ggplot(aes(x = word,y = freq)) +
theme_bw(base_size = 12,base_family = "STKaiti") +
geom_bar(stat = "identity",fill= "red",colour = "lightblue",alpha = 0.6) +
scale_x_discrete() +
theme(axis.text.x = element_text(angle = 75,hjust = 1,size = 8)) +
labs(x = "词项",y = "频数",title = "《红楼梦》词频图")

## 词云
sum(word_freq$freq>=60)
## [1] 390
data.frame(word_freq[word_freq$freq>60,]) %>%
letterCloud("R",wordSize = 12)
## 静态词云
layout(matrix(c(1, 2), nrow=2), heights=c(0.4, 4))
par(mar=rep(0, 4),family = "STKaiti")
plot.new()
text(x=0.5, y=0.3, "红楼梦词云\nMin=60")
wordcloud(words = word_freq$word, freq = word_freq$freq,
scale = c(4,0.8),min.freq = 60,random.order=FALSE,
family = "STKaiti",colors = brewer.pal(8,"Dark2"))
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 史湘云 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 心下 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 只好 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 小姐 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 老祖宗 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 料理 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 林之孝家的 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 书房 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 又道 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 横竖 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 几日 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 五儿 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 一一 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 怡红院 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 宝姐姐 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 笑话 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 在外 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 大事 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 脸上 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 一番 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 好好 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 几天 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 两日 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 侄儿 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 工夫 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 赖大 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 热闹 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 想到 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 老人家 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 一张 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 有个 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 男人 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 旁边 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 一概 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 记得 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 说完 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 因见 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 雨村 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 金桂 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 闲话 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 一块 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 正经 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 红玉 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 贾瑞 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 又说 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 出门 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 床上 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 夫人 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 几句 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 里间 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 说什么 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 跪下 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 好歹 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 门上 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 女孩儿 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 生日 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 时常 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 叔叔 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 头里 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 这么着 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 着急 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 贾蔷 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 一位 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 姨妈 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 他家 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 未了 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 眼睛 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 不在话下 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 当日 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 渐渐 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 命人 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 却说 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 潇湘馆 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 便知 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 叹道 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 无奈 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 走到 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 抱怨 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 欢喜 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 拿来 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 女孩子 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 送来 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 未免 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 一人 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 大老爷 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 凤丫头 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 了不得 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 名字 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 商量 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 虽是 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 屋子 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 放下 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 尚未 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 听得 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 一顿 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 暂且 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 仔细 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 不该 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 不住 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 二姐 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 放在 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 回到 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 近日 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 穿着 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 可怜 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 林之孝 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 一把 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 一阵 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 因笑道 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 照应 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 之物 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 可惜 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 心内 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = word_freq$word, freq = word_freq$freq, scale =
## c(4, : 一早 could not be fit on page. It will not be plotted.

## 动态词云
data.frame(word_freq[word_freq$freq>60,]) %>%
wordcloud2(color = 'random-dark',backgroundColor = "whirt",
shape = 'star' )
## 对每章的内容进行探索分析####
## 对相关章节进行分析
## 每章节的段落长度
p1 <- ggplot(Red_dreamname,aes(x = chapter,y = chaplen)) +
theme_bw(base_family = "STKaiti",base_size = 10) +
geom_point(colour = "red",size = 1) +
geom_line() +
geom_text(aes(x = 25,y = 0.9*max(Red_dreamname$chaplen)),
label="前80章",family = "STKaiti",colour = "Red") +
geom_text(aes(x = 100,y = 0.9*max(Red_dreamname$chaplen)),
label="后40章",family = "STKaiti",colour = "Red") +
geom_vline(xintercept = 80.5,colour = "blue") +
labs(x = "章节",y = "段数",title = "《红楼梦》每章段数")
## 每章节的字数
p2 <- ggplot(Red_dreamname,aes(x = chapter,y = numchars)) +
theme_bw(base_family = "STKaiti",base_size = 10) +
geom_point(colour = "red",size = 1) +
geom_line() +
geom_text(aes(x = 25,y = 0.9*max(Red_dreamname$numchars)),
label="前80章",family = "STKaiti",colour = "Red") +
geom_text(aes(x = 100,y = 0.9*max(Red_dreamname$numchars)),
label="后40章",family = "STKaiti",colour = "Red") +
geom_vline(xintercept = 80.5,colour = "blue") +
labs(x = "章节",y = "字数",title = "《红楼梦》每章字数")
p3 <- ggplot(Red_dreamname,aes(x = chapter,y = wordlen)) +
theme_bw(base_family = "STKaiti",base_size = 10) +
geom_point(colour = "red",size = 1) +
geom_line() +
geom_text(aes(x = 25,y = 0.9*max(Red_dreamname$wordlen)),
label="前80章",family = "STKaiti",colour = "Red") +
geom_text(aes(x = 100,y = 0.9*max(Red_dreamname$wordlen)),
label="后40章",family = "STKaiti",colour = "Red") +
geom_vline(xintercept = 80.5,colour = "blue") +
labs(x = "章节",y = "词数",title = "《红楼梦》每章词数")
## 绘制每一章节的平行坐标图
p4 <- ggparcoord(Red_dreamname,columns = 7:9,scale = "center",
groupColumn = "Group",showPoints = TRUE,
title = "《红楼梦》") +
theme_bw(base_family = "STKaiti",base_size = 10) +
theme(legend.position = "bottom",axis.title.x = element_blank()) +
scale_x_discrete(labels = c("断落数","字数","词数")) +
ylab("中心化数据大小")
gridExtra::grid.arrange(p1,p2,p3,p4,ncol = 2)

## 对三个变量绘制三散点图,
par(family = "STKaiti",mfcol = c(1,1))
color <- rep(c("red","blue"),times = c(80,40))
pchs <- rep(c(21,22),times = c(80,40))
scatterplot3d(x =Red_dreamname$chaplen,y = Red_dreamname$numchars,
z=Red_dreamname$wordlen,color = color,pch = pchs,
xlab="断落数", ylab="字数", zlab="词数",scale.y=1,
angle=30,main = "《红楼梦》")
legend("topleft", inset=.05, # location and inset
bty="n", cex=.8, # suppress legend box, shrink text 50%
title="章节",
legend = c("前80章","后40章"),
pch = c(21,22),
col = c("red","blue"))

## 可交互三维散点图
plot_ly(Red_dreamname, x = ~chaplen, y = ~numchars, z = ~wordlen) %>%
add_markers(color = ~Group,text = ~paste("Name: ", name)) %>%
layout(title = "《红楼梦》")
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
##矩阵散点图
Red_dreamname_mat <- Red_dreamname[c("chaplen","numchars","wordlen","Group")]
names(Red_dreamname_mat) <- c("断落数","字数","词数","章节")
ggscatmat(Red_dreamname_mat,columns = c("断落数","字数","词数"),color = "章节") +
theme_bw(base_family = "STKaiti") +
ggtitle("《红楼梦》")

## 三个变量进行聚类分析
## 对每章节进行聚类分析####
## 1:构建文档-词项tf-IDF矩阵
corpus2 <- Corpus(VectorSource(Fen_red))
Red_dtm_tfidf <- DocumentTermMatrix(corpus2,control = list(wordLengths=c(1,Inf),
weighting = weightTfIdf))
Red_dtm_tfidf
## <<DocumentTermMatrix (documents: 120, terms: 40342)>>
## Non-/sparse entries: 117314/4723726
## Sparsity : 98%
## Maximal term length: 12
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## 一共有4万多个词
## 降低tfidf矩阵的稀疏度
Red_dtm_tfidfr <- removeSparseTerms(Red_dtm_tfidf,0.95)
Red_dtm_tfidfr
## <<DocumentTermMatrix (documents: 120, terms: 3093)>>
## Non-/sparse entries: 59053/312107
## Sparsity : 84%
## Maximal term length: 6
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## 只留下了3000多个关键的字
## 使用系统聚类对每个章节进行聚类
Red_dtm_tfidfr_mat <- as.matrix(Red_dtm_tfidfr)
## 文本间的距离度量为夹角余弦距离
Red_dtm_tfidfr_dist <- proxy::dist(Red_dtm_tfidfr_mat,method ="cosine")
## 系统聚类,聚为两类
k = 6
Red_clust <- hclust(d = Red_dtm_tfidfr_dist,method = "average")
Red_clust$labels <- Red_dreamname$chapter2
## 可视化绘图
par(family = "STKaiti",cex = 0.6)
plot(Red_clust,
main = '红楼梦章节聚类\nmethod = average',
xlab = '', ylab = '', sub = '')
groups <- cutree(Red_clust, k=k) # "k=" defines the number of clusters you are using
rect.hclust(Red_clust, k=k, border="red") # draw dendogra

## 每组有多少章
table(groups)
## groups
## 1 2 3 4 5 6
## 5 6 43 18 36 12
# dfgroup <- as.data.frame(groups)
k = 5
Red_clust <- hclust(d = Red_dtm_tfidfr_dist,method = "ward.D2")
Red_clust$labels <- Red_dreamname$chapter2
## 可视化绘图
par(family = "STKaiti",cex = 0.6)
plot(Red_clust,
main = '红楼梦章节聚类\nmethod = word.D2',
xlab = '', ylab = '', sub = '')
groups <- cutree(Red_clust, k=k) # "k=" defines the number of clusters you are using
rect.hclust(Red_clust, k=k, border="red") # draw dendogra

## 每组有多少章
table(groups)
## groups
## 1 2 3 4 5
## 21 6 49 30 14
# dfgroup <- as.data.frame(groups)
## 绘制各章节的关系网络,连接权重为距离系数
# ## 构建连接矩阵
# Red_dist_cut <- as.matrix(Red_dtm_tfidfr)
# ## Transform Data into an Adjacency Matrix
# # change it to a Boolean matrix
# Red_dist_cut[Red_dist_cut>=1] <- 1
# # transform into a term-term adjacency matrix
# Red_dist_cut <- Red_dist_cut %*% t(Red_dist_cut)
# # inspect terms numbered 5 to 10
# # Red_dist_cut[5:10,5:10]
summary(Red_dtm_tfidfr_dist)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1991 0.8721 0.9128 0.8957 0.9407 0.9888
threshoud <- 0.8
Red_dist_cut <- as.matrix(Red_dtm_tfidfr_dist)
for (ii in 1:dim(Red_dist_cut)[1]) {
for (kk in 1:dim(Red_dist_cut)[2]) {
## 距离大于的则没有连接
aa <- Red_dist_cut[ii,kk]
## 数值越小权重越大
aa <- ifelse(aa >=threshoud,0,aa)
aa <- abs(aa - threshoud)
aa <- ifelse(aa < threshoud,aa+threshoud,0)
Red_dist_cut[ii,kk] <- aa
}
}
# # plot(as.vector(Red_dist_cut))
# # names(Red_dist_cut) <- Red_dreamname$chapter2
row.names(Red_dist_cut) <- Red_dreamname$chapter
# build a graph from the above matrix
g <- graph.adjacency(Red_dist_cut, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- row.names(Red_dist_cut)
V(g)$degree <- degree(g)
## 绘制每章节的网络关系图
set.seed(3952)
par(family ="STKaiti",cex = 1)
layout1 <- layout.kamada.kawai(g)
plot(g, layout=layout1)

## 美化图形
V(g)$label.cex <- 1.2 * V(g)$degree / max(V(g)$degree) +0.4
V(g)$label.color <- rgb(0, 0, .2, .8)
V(g)$frame.color <- NA
egam <- (log(E(g)$weight)+.4) / max(log(E(g)$weight)+.4)
E(g)$color <- rgb(.5, .9, 0, egam)
E(g)$width <- egam *4
# plot the graph in layout1
plot(g, layout=layout1,main = "《红楼梦》章节的关系")

layout2 <- layout.sphere(g)
plot(g, layout=layout2,main = "《红楼梦》章节的关系")

## 文字越大,说明与该章相关的章节数越多
## 连接的线越粗,说明联系越大
## 使用Lda主题模型进行主题挖掘分析
Red_dtmr <- removeSparseTerms(Red_dtm,0.9)
lda <- LDA(Red_dtmr, k = 6)
(term <- terms(lda, 10))
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
## [1,] "笑道" "凤姐" "贾政" "宝玉" "贾母" "宝玉"
## [2,] "姑娘" "贾琏" "宝玉" "黛玉" "贾珍" "凤姐"
## [3,] "一个" "老太太" "王夫人" "袭人" "笑道" "笑道"
## [4,] "探春" "奶奶" "老爷" "宝钗" "尤氏" "凤姐儿"
## [5,] "平儿" "贾母" "太太" "笑道" "贾蓉" "只见"
## [6,] "奶奶" "平儿" "众人" "贾母" "一个" "王夫人"
## [7,] "太太" "听见" "宝钗" "一个" "众人" "贾母"
## [8,] "众人" "姑娘" "袭人" "紫鹃" "凤姐" "林黛玉"
## [9,] "李纨" "一个" "一个" "湘云" "宝玉" "薛蟠"
## [10,] "鸳鸯" "告诉" "不知" "姑娘" "不知" "一个"